In [2]:
%load_ext autoreload
%autoreload 2
import numpy as np
from random import *
from dmww_classes import *
from plotting_helper import *
from sampling_helper import *
seed(1) # for debugging
Each wurwur (words world model) instance depends on a world, a corpus, and a lexicon; the lexicon is learned by the model and connects the other two.
In [9]:
w = World(n_words=8, n_objs=8)
w.show()
c = Corpus(world=w, n_sents=40, n_per_sent=2)
c.show()
Check the lexicon counts learned via co-occurrence:
In [10]:
l = CoocLexicon(world=w)
l.learn_lex(c)
lexplot(l, w)
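For intuition, co-occurrence learning amounts to incrementing a word-by-object count matrix for every pair that appears together in a sentence. A minimal sketch of the idea, assuming each sentence is available as a (word ids, object ids) pair rather than going through the Corpus API:

# Sketch only: the real CoocLexicon consumes a Corpus object directly.
def cooc_counts(sents, n_words, n_objs):
    counts = np.zeros((n_objs, n_words))
    for words, objs in sents:
        for o in objs:
            for wd in words:
                counts[o, wd] += 1  # one count per co-occurring pair
    return counts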
Now try this with the Gibbs sampler (the actual model):
In [11]:
p = Params(n_samps=20,          # number of Gibbs samples
           alpha_r=.1,
           alpha_nr=10,
           empty_intent=.0001,
           n_hypermoves=5)      # hyperparameter inference moves (used when hyper_inf=True)
l = GibbsLexicon(c, p, verbose=0, hyper_inf=True)
l.learn_lex(c, p)
lexplot(l, w, certainwords=0)
l.ref
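The learned referential counts live in l.ref; judging from how it is indexed later in this notebook, rows are objects and columns are words. A quick sketch, under that assumption, for turning the raw counts into conditional probability estimates:

# Sketch: row-normalize l.ref (assumed n_objs x n_words) into
# p(word | object) estimates, guarding against all-zero rows.
ref_probs = l.ref.astype(float)
row_sums = ref_probs.sum(axis=1)[:, None]
ref_probs = ref_probs / np.where(row_sums == 0, 1, row_sums)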
Now do the same with a small toy corpus read from file:
In [12]:
w = World(corpus='corpora/corpus_toy.csv')
w.show()
c = Corpus(world=w, corpus='corpora/corpus_toy.csv')
c.show()
In [14]:
l = CoocLexicon(w)
l.learn_lex(c)
l.show()
In [15]:
l = GibbsLexicon(c, p, verbose=0, hyper_inf=True)
l.learn_lex(c, p)
lexplot(l, w, certainwords=0)
l.params.show()
Now scale up to the full corpus:
In [33]:
corpusfile = 'corpora/corpus.csv'
w = World(corpus=corpusfile)
w.show()
c = Corpus(world=w, corpus=corpusfile)
In [49]:
p = Params(n_samps=100,
           alpha_r=.1,
           alpha_nr=10,
           empty_intent=.0001,
           n_hypermoves=5)
l = GibbsLexicon(c, p, verbose=0, hyper_inf=True)
l.learn_lex(c, p)
lexplot(l, w)
In [50]:
for o in range(w.n_objs):
    wd = np.where(l.ref[o, :] == l.ref[o, :].max())
    print "o: %s, w: %s" % (w.objs_dict[o][0], w.words_dict[wd[0][0]][0])
Note that the F-score here treats ANY non-zero count as a proposed word-object link. We should think about the best way to do this for real corpora.
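One option, sketched below, is to binarize the learned counts at an adjustable threshold before scoring, rather than treating any non-zero count as a link. The function and the gold matrix here are hypothetical; the actual evaluation uses get_f on a gold-standard Corpus, as in the next cell.

# Hypothetical alternative scorer: `gold` is a binary n_objs x n_words
# matrix of true word-object links (not the Corpus used by get_f).
def f_score_at_threshold(ref, gold, thresh=0):
    pred = ref > thresh                       # binarize learned counts
    tp = float(np.sum(pred & (gold > 0)))     # true positive links
    precision = tp / np.sum(pred) if np.sum(pred) else 0.
    recall = tp / np.sum(gold > 0) if np.sum(gold > 0) else 0.
    if precision + recall == 0:
        return 0.
    return 2 * precision * recall / (precision + recall)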
In [54]:
corpusfile = 'corpora/gold_standard.csv'
c_gs = Corpus(world=w, corpus=corpusfile)
get_f(l.ref, c_gs)